library(tm)
library(tidytext)
library(tidyverse)
library(DT)
library(tidyverse)
library(tidytext)
library(DT)
library(scales)
library(wordcloud2)
library(gridExtra)
library(ngram)
library(shiny)
library(wordcloud)
library(knitr)
Is Technology Making Millennials Happier Than Human Interaction?
About a year ago, Forbes magazine published an article titled “Why Millennials are Lonely”. It claimed that “the second reason for millennial loneliness is the Internet makes it viral.” It emphasizes that “It’s not a coincidence that loneliness began to surge two years after Apple launched its first commercial personal computer and five years before Tim Berners-Lee invented the World Wide Web.” This is a narrative seen across major media organizations such as The New York Times and NPR. These outlets often portray millennials as people who are always on their phones and out of touch with the outside world. It is also commonly believed that millennials don’t want relationships, as pointed out by the Huffington Post, which almost suggests that millennials have no need for human touch.
But are millennials as lonely as these outlets claim? Is technology the main cause of this loneliness? Are millennials no longer drawn to human touch? Let’s find out.
I looked through a database (HappyDB) to identify what made people happy. I then selected three groups, millennials (Generation Y), Generation X and Generation Z, as defined by the Pew Research Center: Generation X as people between the ages of 38 and 53, Generation Y as people between the ages of 22 and 37, and Generation Z as people between the ages of 1 and 21.
# Step 1 - Load the processed text data along with demographic information on contributors
# We use the processed data for our analysis and combine it with the demographic information available.
# Each source is read only once and then copied: the original code parsed the
# same CSV twice and downloaded the same remote file twice. R copies on
# modification, so the *1 and *2 objects remain independent of each other.
hm_data1 <- read_csv("../output/processed_moments.csv")
hm_data2 <- hm_data1
urlfile <- "https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/demographic.csv"
demo_data1 <- read_csv(urlfile)
demo_data2 <- demo_data1
#print(hm_data)
# ### Combine both the data sets and keep the required columns for analysis
#
# #We select a subset of the data that satisfies specific row conditions.
# hm_data1 <- hm_data1 %>%
# inner_join(demo_data1, by = "wid") %>%
# select(wid,
# original_hm,
# gender,
# marital,
# parenthood,
# reflection_period,
# age,
# country,
# ground_truth_category,
# text) %>%
# mutate(count = sapply(hm_data1$text, wordcount)) %>%
# filter(gender %in% c("m", "f")) %>%
# filter(marital %in% c("single", "married")) %>%
# filter(parenthood %in% c("n", "y")) %>%
# filter(reflection_period %in% c("24h", "3m")) %>%
# mutate(reflection_period = fct_recode(reflection_period,
# months_3 = "3m", hours_24 = "24h"))
# dim(hm_data1)
# Combine the happy moments with the demographic table (joined on `wid`),
# keep only the columns needed for the analysis, add a per-moment word count,
# and restrict the rows to the gender / marital / parenthood /
# reflection-period values listed below.
hm_data2 <- hm_data2 %>%
  inner_join(demo_data2, by = "wid") %>%
  select(wid,
         original_hm,
         gender,
         marital,
         parenthood,
         reflection_period,
         age,
         country,
         ground_truth_category,
         text) %>%
  # Count words on the piped `text` column. The original used the pre-join
  # global `hm_data2$text`, which fails or misaligns as soon as the join
  # changes the number or order of rows. vapply() also avoids attaching each
  # full text as an element name, which sapply() does for character input.
  mutate(count = vapply(text, wordcount, numeric(1), USE.NAMES = FALSE)) %>%
  filter(gender %in% c("m", "f")) %>%
  filter(marital %in% c("single", "married")) %>%
  filter(parenthood %in% c("n", "y")) %>%
  filter(reflection_period %in% c("24h", "3m")) %>%
  # Recode the period labels to months_3 / hours_24.
  mutate(reflection_period = fct_recode(reflection_period,
                                        months_3 = "3m", hours_24 = "24h"))
#print(hm_data2)
#datatable(hm_data)
#names(hm_data)
#datatable(hm_data1)
# Display the joined and filtered moments (hm_data2) as a DT table widget.
datatable(hm_data2)
#Create bigrams using the text data
# hm_bigrams <- hm_data1 %>%
# filter(count != 1) %>%
# unnest_tokens(bigram, text, token = "ngrams", n = 2)
#
# bigram_counts <- hm_bigrams %>%
# separate(bigram, c("word1", "word2"), sep = " ") %>%
# count(word1, word2, sort = TRUE)
# Build bigrams from the cleaned text.
# Moments whose word count is exactly 1 are dropped first, then the text is
# tokenised into overlapping two-word sequences (one bigram per row).
hm_bigrams2 <- unnest_tokens(
  filter(hm_data2, count != 1),
  bigram, text,
  token = "ngrams", n = 2
)

# Split every bigram into its two words and tally each (word1, word2) pair,
# most frequent pair first.
bigram_counts2 <- count(
  separate(hm_bigrams2, bigram, c("word1", "word2"), sep = " "),
  word1, word2,
  sort = TRUE
)
# Split the moments into three age cohorts (Generation Z, Y and X).
# `age` is coerced to numeric first; entries that cannot be parsed become NA
# and are therefore excluded from every cohort by filter().
hm_data2 <- hm_data2 %>%
  mutate(age = as.numeric(as.character(age)))

# Generation Z: ages 1-21
generationZ <- filter(hm_data2, between(age, 1, 21))
#print(generationZ)

# Generation Y (millennials): ages 22-37
generationY <- filter(hm_data2, between(age, 22, 37))
#print(generationY)

# Generation X: ages 38-53
generationX <- filter(hm_data2, between(age, 38, 53))
#print(generationX)

# Cohort sizes observed when this analysis was written:
# dim(generationX) 14560 11
# dim(generationY) 70233 11
# dim(generationZ)  5540 11
# ### Create a data set with milennials only
# hm_data1$age <- as.numeric(as.character(hm_data1$age))
# millenials<-hm_data1 %>%
# filter(age>=21 & age<=37)
# # tech_related<-read_csv("Technology_Entertainment.csv")
# # people_related<-read_csv("people-dict.csv")
# print(millenials)
# #Create a data set with non-millenials
# hm_data2$age <- as.numeric(as.character(hm_data2$age))
# names(hm_data2)
# hm_data2[age<=20 & age>=30]
#
# # non_millenials<-hm_data2 %>%
# # filter(age<=20 & age>=30)
# # print(non_millenials)
# print(hm_data2)
# dim(hm_data2)
# #Create a bag of words using the text data
### Create a bag of words using the text data
# For each cohort: tokenise the cleaned text into one word per row, then
# tally how often each word occurs, most frequent word first.

# Generation X
bag_of_words1 <- unnest_tokens(generationX, word, text)
word_count1 <- count(bag_of_words1, word, sort = TRUE)

# Generation Y
bag_of_words2 <- unnest_tokens(generationY, word, text)
word_count2 <- count(bag_of_words2, word, sort = TRUE)

# Generation Z
bag_of_words3 <- unnest_tokens(generationZ, word, text)
word_count3 <- count(bag_of_words3, word, sort = TRUE)
Generation X
# Word cloud of the Generation X word frequencies (word_count1).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count1$word, freq = word_count1$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Paired"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation X")

Generation Y
# Word cloud of the Generation Y word frequencies (word_count2).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count2$word, freq = word_count2$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation Y")

Generation Z
# Word cloud of the Generation Z word frequencies (word_count3).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count3$word, freq = word_count3$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Set1"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation Z")

As shown in the word clouds above, the most common source of happiness in the lives of millennials during those 24 hours was their friends. Out of the 72,060 millennials who answered the survey, over 8,000 of them included friends and family as the top reason for their happiness.
# easyGgplot2 is not on CRAN; it is installed from GitHub through devtools.
# One-time setup (run manually, not on every render):
# install.packages("devtools")
library(devtools)
# devtools::install_github("kassambara/easyGgplot2")
#dim(word_count)
# The interactive help lookup `?barplot` that used to sit here was removed:
# it is a console aid and does not belong in a script or rendered document.
#barplot(head(sort(word_count,decreasing=TRUE), n = 10))
# data(word_count) # clarity is a good categorical variable
# with(word_count, barplot(rev(sort(table(n))[1:10])))
# counts <- table(word_count$n)[1:10]
# typeof(word_count)
# # #barplot(word_count)[1:10]
# # print(word_count)
# # pino2=melt(word_count,id.vars=c('word','n'))
# # ggplot(pino2,aes(x=factor(word),y=n))+geom_bar(stat='identity')
# df <- data.frame(matrix(unlist(word_count), nrow=16532, byrow=T))
# print(df)
# ggplot(df,aes(x=factor(word),y=n))+geom_bar(stat='identity')
#word_count2<-word_count[1:10]
#ggplot(word_count, aes(x=word, y=n))
# wordyu<-as.data.frame(word_count)
# ndjksbd<-wordyu[1:10]
# print(ndjksbd)
#word_count
# #ggplot(word_count, aes(x=word, y=n))
# Keep the first ten rows of each cohort's word tally (the tallies are sorted
# by descending frequency, so these are the ten most used words).
mod_word_count1 <- word_count1 %>% head(n = 10)
mod_word_count2 <- word_count2 %>% head(n = 10)
mod_word_count3 <- word_count3 %>% head(n = 10)

library(easyGgplot2)

# One bar chart of raw word frequencies per cohort.
ggplot2.barplot(
  data = mod_word_count1, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightyellow",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation X"
)

ggplot2.barplot(
  data = mod_word_count2, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightblue",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Y (Millenials)"
)

ggplot2.barplot(
  data = mod_word_count3, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightpink",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Z"
)

library(easyGgplot2)

# Express each top-10 word count relative to the number of moments in its
# cohort (occurrences per 100 moments). The denominators come from the cohort
# tables themselves instead of the previously hard-coded row counts
# (14560 / 70233 / 5540), so the charts stay correct when the data or the
# filters change. The value is stored in a real column (`share`) rather than
# being passed to the plotting function as an expression string.
share_word_count1 <- mutate(mod_word_count1, share = n / nrow(generationX) * 100)
share_word_count2 <- mutate(mod_word_count2, share = n / nrow(generationY) * 100)
share_word_count3 <- mutate(mod_word_count3, share = n / nrow(generationZ) * 100)

ggplot2.barplot(
  data = share_word_count1, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightyellow",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation X"
)

ggplot2.barplot(
  data = share_word_count2, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightblue",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Y (Millenials)"
)

ggplot2.barplot(
  data = share_word_count3, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightpink",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Z"
)

# #print(tech_related)
# #print(people_related)
#
# word_list = str_split(millenials$text, " ")
# words = unlist(word_list)
# people.matches = match(words, people_related)
# tech.matches = match(words, tech_related)
# # get the position of the matched term or NA
# # we just want a TRUE/FALSE
# people_matches = !is.na(people.matches)
# tech_matches = !is.na(tech.matches)
# # final score
# score1 = sum(people_matches)
# score2=sum(tech_matches)
# print(score1)
# print(score2)
---
title: "Project 1"
output: html_notebook
---
---
title: "R Notebook"
author: "Zuleimy Alcantara (zga2102)"
date: "9/18/2018"
output:
  html_document:
    df_print: paged
---



```{r load libraries, warning=FALSE, message=FALSE}
library(tm)
library(tidytext)
library(tidyverse)
library(DT)
library(tidyverse)
library(tidytext)
library(DT)
library(scales)
library(wordcloud2)
library(gridExtra)
library(ngram)
library(shiny) 
library(wordcloud)
library(knitr)
```

### Is Technology Making Millennials Happier Than Human Interaction?
 ![](happytech.jpg)
 

About a year ago, Forbes magazine published an article titled “Why Millennials are Lonely”. It claimed that “the second reason for millennial loneliness is the Internet makes it viral.” It emphasizes that “It’s not a coincidence that loneliness began to surge two years after Apple launched its first commercial personal computer and five years before Tim Berners-Lee invented the World Wide Web.” This is a narrative seen across major media organizations such as The New York Times and NPR. These outlets often portray millennials as people who are always on their phones and out of touch with the outside world. It is also commonly believed that millennials don’t want relationships, as pointed out by the Huffington Post, which almost suggests that millennials have no need for human touch. ![NYT-Are You 21 to 37? You Might Be a Millennial](millenial.jpg) But are millennials as lonely as these outlets claim? Is technology the main cause of this loneliness? Are millennials no longer drawn to human touch? Let’s find out.

I looked through a database (HappyDB) to identify what made people happy. I then selected three groups, millennials (Generation Y), Generation X and Generation Z, as defined by the Pew Research Center: Generation X as people between the ages of 38 and 53, Generation Y as people between the ages of 22 and 37, and Generation Z as people between the ages of 1 and 21.

```{r load data, warning=FALSE, message=FALSE}
# Step 1 - Load the processed text data along with demographic information on contributors

# We use the processed data for our analysis and combine it with the demographic information available.
# Each source is read only once and then copied: the original code parsed the
# same CSV twice and downloaded the same remote file twice. R copies on
# modification, so the *1 and *2 objects remain independent of each other.
hm_data1 <- read_csv("../output/processed_moments.csv")
hm_data2 <- hm_data1
urlfile <- "https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/demographic.csv"
demo_data1 <- read_csv(urlfile)
demo_data2 <- demo_data1
#print(hm_data)
```



```{r, warning=FALSE, message=FALSE}
# ### Combine both the data sets and keep the required columns for analysis
# 
# #We select a subset of the data that satisfies specific row conditions.
# hm_data1 <- hm_data1 %>%
#   inner_join(demo_data1, by = "wid") %>%
#   select(wid,
#          original_hm,
#          gender, 
#          marital, 
#          parenthood,
#          reflection_period,
#          age, 
#          country, 
#          ground_truth_category, 
#          text) %>%
#   mutate(count = sapply(hm_data1$text, wordcount)) %>%
#   filter(gender %in% c("m", "f")) %>%
#   filter(marital %in% c("single", "married")) %>%
#   filter(parenthood %in% c("n", "y")) %>%
#   filter(reflection_period %in% c("24h", "3m")) %>%
#   mutate(reflection_period = fct_recode(reflection_period, 
#                                         months_3 = "3m", hours_24 = "24h"))
# dim(hm_data1)
```

```{r combining data, warning=FALSE, message=FALSE}

# Combine the happy moments with the demographic table (joined on `wid`),
# keep only the columns needed for the analysis, add a per-moment word count,
# and restrict the rows to the gender / marital / parenthood /
# reflection-period values listed below.
hm_data2 <- hm_data2 %>%
  inner_join(demo_data2, by = "wid") %>%
  select(wid,
         original_hm,
         gender,
         marital,
         parenthood,
         reflection_period,
         age,
         country,
         ground_truth_category,
         text) %>%
  # Count words on the piped `text` column. The original used the pre-join
  # global `hm_data2$text`, which fails or misaligns as soon as the join
  # changes the number or order of rows. vapply() also avoids attaching each
  # full text as an element name, which sapply() does for character input.
  mutate(count = vapply(text, wordcount, numeric(1), USE.NAMES = FALSE)) %>%
  filter(gender %in% c("m", "f")) %>%
  filter(marital %in% c("single", "married")) %>%
  filter(parenthood %in% c("n", "y")) %>%
  filter(reflection_period %in% c("24h", "3m")) %>%
  # Recode the period labels to months_3 / hours_24.
  mutate(reflection_period = fct_recode(reflection_period,
                                        months_3 = "3m", hours_24 = "24h"))
#print(hm_data2)
```



```{r warning=FALSE, message=FALSE}
#datatable(hm_data)
#names(hm_data)
#datatable(hm_data1)
# Display the joined and filtered moments (hm_data2) as a DT table widget.
datatable(hm_data2)
```




```{r, warning=FALSE, message=FALSE}
#Create bigrams using the text data
# hm_bigrams <- hm_data1 %>%
#   filter(count != 1) %>%
#   unnest_tokens(bigram, text, token = "ngrams", n = 2)
# 
# bigram_counts <- hm_bigrams %>%
#   separate(bigram, c("word1", "word2"), sep = " ") %>%
#   count(word1, word2, sort = TRUE)
```

```{r bigram, warning=FALSE, message=FALSE}
# Build bigrams from the cleaned text.
# Moments whose word count is exactly 1 are dropped first, then the text is
# tokenised into overlapping two-word sequences (one bigram per row).
hm_bigrams2 <- unnest_tokens(
  filter(hm_data2, count != 1),
  bigram, text,
  token = "ngrams", n = 2
)

# Split every bigram into its two words and tally each (word1, word2) pair,
# most frequent pair first.
bigram_counts2 <- count(
  separate(hm_bigrams2, bigram, c("word1", "word2"), sep = " "),
  word1, word2,
  sort = TRUE
)
```

```{r, warning=FALSE, message=FALSE}
# Split the moments into three age cohorts (Generation Z, Y and X).
# `age` is coerced to numeric first; entries that cannot be parsed become NA
# and are therefore excluded from every cohort by filter().
hm_data2 <- hm_data2 %>%
  mutate(age = as.numeric(as.character(age)))

# Generation Z: ages 1-21
generationZ <- filter(hm_data2, between(age, 1, 21))
#print(generationZ)

# Generation Y (millennials): ages 22-37
generationY <- filter(hm_data2, between(age, 22, 37))
#print(generationY)

# Generation X: ages 38-53
generationX <- filter(hm_data2, between(age, 38, 53))
#print(generationX)

# Cohort sizes observed when this analysis was written:
# dim(generationX) 14560 11
# dim(generationY) 70233 11
# dim(generationZ)  5540 11
```

```{r, fig.align="center"}
# ### Create a data set with milennials only
# hm_data1$age <- as.numeric(as.character(hm_data1$age))
# millenials<-hm_data1 %>% 
#   filter(age>=21 & age<=37)
# # tech_related<-read_csv("Technology_Entertainment.csv")
# # people_related<-read_csv("people-dict.csv")
# print(millenials)


```



```{r, warning=FALSE, message=FALSE}
# #Create a data set with non-millenials
# hm_data2$age <- as.numeric(as.character(hm_data2$age))
# names(hm_data2)
# hm_data2[age<=20 & age>=30]
# 
# # non_millenials<-hm_data2 %>% 
# #   filter(age<=20 & age>=30) 
# # print(non_millenials)
# print(hm_data2)
# dim(hm_data2)
```

```{r , warning=FALSE, message=FALSE}

# #Create a bag of words using the text data

```

```{r bag of words, warning=FALSE, message=FALSE}
### Create a bag of words using the text data
# For each cohort: tokenise the cleaned text into one word per row, then
# tally how often each word occurs, most frequent word first.

# Generation X
bag_of_words1 <- unnest_tokens(generationX, word, text)
word_count1 <- count(bag_of_words1, word, sort = TRUE)

# Generation Y
bag_of_words2 <- unnest_tokens(generationY, word, text)
word_count2 <- count(bag_of_words2, word, sort = TRUE)

# Generation Z
bag_of_words3 <- unnest_tokens(generationZ, word, text)
word_count3 <- count(bag_of_words3, word, sort = TRUE)
```
## Generation X
```{r, warning=FALSE, message=FALSE}
# Word cloud of the Generation X word frequencies (word_count1).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count1$word, freq = word_count1$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Paired"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation X")
```

## Generation Y
```{r, warning=FALSE, message=FALSE}
# Word cloud of the Generation Y word frequencies (word_count2).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count2$word, freq = word_count2$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation Y")
```
## Generation Z
```{r, warning=FALSE, message=FALSE}
# Word cloud of the Generation Z word frequencies (word_count3).
# The seed is fixed so the word layout is reproducible between runs.
set.seed(1234)
wordcloud(words = word_count3$word, freq = word_count3$n, min.freq = 1,
          max.words = 10000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Set1"))
# wordcloud() forwards extra arguments to text(), which has no `main`
# parameter, so the original `main=` never produced a title (only warnings).
# Draw the title explicitly instead.
title(main = "Generation Z")

```
As shown in the word clouds above, the most common source of happiness in the lives of millennials during those 24 hours was their friends. Out of the 72,060 millennials who answered the survey, over 8,000 of them included friends and family as the top reason for their happiness.
```{r, warning=FALSE}
# easyGgplot2 is not on CRAN; it is installed from GitHub through devtools.
# One-time setup (run manually, not on every render):
# install.packages("devtools")
library(devtools)
# devtools::install_github("kassambara/easyGgplot2")

#dim(word_count)
# The interactive help lookup `?barplot` that used to sit here was removed:
# it is a console aid and does not belong in a script or rendered document.
#barplot(head(sort(word_count,decreasing=TRUE), n = 10))

# data(word_count) # clarity is a good categorical variable
# with(word_count, barplot(rev(sort(table(n))[1:10])))
# counts <- table(word_count$n)[1:10]
# typeof(word_count)
# # #barplot(word_count)[1:10]
# # print(word_count)
# # pino2=melt(word_count,id.vars=c('word','n'))
# # ggplot(pino2,aes(x=factor(word),y=n))+geom_bar(stat='identity')
# df <- data.frame(matrix(unlist(word_count), nrow=16532, byrow=T))
# print(df)
# ggplot(df,aes(x=factor(word),y=n))+geom_bar(stat='identity')
#word_count2<-word_count[1:10]
#ggplot(word_count, aes(x=word, y=n))
# wordyu<-as.data.frame(word_count)
# ndjksbd<-wordyu[1:10]
# print(ndjksbd)
#word_count
# #ggplot(word_count, aes(x=word, y=n))
# Keep the first ten rows of each cohort's word tally (the tallies are sorted
# by descending frequency, so these are the ten most used words).
mod_word_count1 <- word_count1 %>% head(n = 10)
mod_word_count2 <- word_count2 %>% head(n = 10)
mod_word_count3 <- word_count3 %>% head(n = 10)

library(easyGgplot2)

# One bar chart of raw word frequencies per cohort.
ggplot2.barplot(
  data = mod_word_count1, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightyellow",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation X"
)

ggplot2.barplot(
  data = mod_word_count2, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightblue",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Y (Millenials)"
)

ggplot2.barplot(
  data = mod_word_count3, xName = "word", yName = "n",
  xtickLabelRotation = 90, backgroundColor = "lightpink",
  fill = "green", color = "blue",
  ytitle = "Frequency", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Z"
)

```
  

```{r, warning=FALSE, message=FALSE}
library(easyGgplot2)

# Express each top-10 word count relative to the number of moments in its
# cohort (occurrences per 100 moments). The denominators come from the cohort
# tables themselves instead of the previously hard-coded row counts
# (14560 / 70233 / 5540), so the charts stay correct when the data or the
# filters change. The value is stored in a real column (`share`) rather than
# being passed to the plotting function as an expression string.
share_word_count1 <- mutate(mod_word_count1, share = n / nrow(generationX) * 100)
share_word_count2 <- mutate(mod_word_count2, share = n / nrow(generationY) * 100)
share_word_count3 <- mutate(mod_word_count3, share = n / nrow(generationZ) * 100)

ggplot2.barplot(
  data = share_word_count1, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightyellow",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation X"
)

ggplot2.barplot(
  data = share_word_count2, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightblue",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Y (Millenials)"
)

ggplot2.barplot(
  data = share_word_count3, xName = "word", yName = "share",
  xtickLabelRotation = 90, backgroundColor = "lightpink",
  fill = "green", color = "blue",
  ytitle = "Percentage", xtitle = "Words",
  mainTitle = "Top 10 Words Used by Generation Z"
)
# #print(tech_related)
# #print(people_related)
# 
#   word_list = str_split(millenials$text, " ")
#                    words = unlist(word_list)
#                    people.matches = match(words, people_related)
#                    tech.matches = match(words, tech_related)
#                    # get the position of the matched term or NA
#                    # we just want a TRUE/FALSE
#                    people_matches = !is.na(people.matches)
#                    tech_matches = !is.na(tech.matches)
#                    # final score
#                     score1 = sum(people_matches) 
#                     score2=sum(tech_matches)
#                     print(score1)
#                     print(score2)

```

